contour_RFT_1000_legend.pdf
p1C + ggrepel::geom_text_repel(
aes(label = modelshort, col = type),
size = 3, # Increased label size
max.overlaps = Inf,
min.segment.length = Inf,
show.legend = FALSE
)

# Extract the legend and save it ----
# This makes it easy to arrange things on slides by giving a separate
# image for the legend.
legend <- cowplot::get_legend(p1C)
## Warning in get_plot_component(plot, "guide-box"): Multiple components found;
## returning the first one. To return all, use `return_all = TRUE`.
# Plot only the legend
legend_plot <- ggdraw(legend)
# legend_plot
# Save the legend as a separate image
#ggsave("graphs/sree2024/legend_only.png", plot = legend_plot, width = 5, height = 5)
# Now make the plot without the legend
dd = filter( df, cov_set_size == "small", train_set_size == 1000, queen == "RF T" )
p1C <- ggplot( no_core( dd ), aes(se, bias) ) +
contour_plot_base( max_bias = max_bias, max_se = max_se, step = .1, include_scales = FALSE ) +
geom_point( aes(color = type), size = 4, shape=19 ) +
geom_point( data = only_core( dd ), aes(pch=model), col="darkgrey", fill="darkgrey", size=4 ) +
# geom_point( data = gbs, size = 7, shape = 1, col="red" ) +
scale_shape_manual(breaks = c( "ATE", "OLS S", "LASSO INF", "RF INF" ),
values = c( 15, 23, 25, 24 ) ) +
guides(color = guide_legend(title = "Model Type"),
shape = guide_legend(title = "Reference") ) +
coord_fixed( xlim=c(0, max_se) ) +
theme( legend.position = "none" )
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
contour_RFT_1000.pdf
p1C + ggrepel::geom_text_repel(
aes(label = modelshort, col = type),
size = 3, # Increased label size
max.overlaps = Inf,
min.segment.length = Inf,
show.legend = FALSE
)

#ggsave( filename = "graphs/sree2024/contour_RFT_1000.pdf", width = IMAGE_WIDTH, height = 5 )
# Plot averaged across queens ----
table( df$queen )
##
## ATE BART T CDML CF LASSO MCM EA LASSO R
## 356 356 356 356 158 356
## OLS S RF MOM IPW RF T SL T XGBOOST R
## 356 356 356 356 356
dd = filter( ca1_agg, outcome_index == 1,
cov_set_size == "small",
train_set_size == 1000 )
p1C <- ggplot( no_core( dd ), aes(se, bias) ) +
contour_plot_base( max_bias = max_bias, max_se = max_se, step = .1, include_scales = FALSE ) +
geom_point( aes(color = type), size = 4, shape=19 ) +
geom_point( data = only_core( dd ), aes(pch=model), col="darkgrey", fill="darkgrey", size=4 ) +
# geom_point( data = gbs, size = 7, shape = 1, col="red" ) +
scale_shape_manual(breaks = c( "ATE", "OLS S", "LASSO INF", "RF INF" ),
values = c( 15, 23, 25, 24 ) ) +
guides(color = guide_legend(title = "Model Type"),
shape = guide_legend(title = "Reference") ) +
coord_fixed( xlim=c(0, max_se) ) +
theme( legend.position = "none" )
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
contour_all_1000.pdf
p1C + ggrepel::geom_text_repel(
aes(label = modelshort, col = type),
size = 3, # Increased label size
max.overlaps = Inf,
min.segment.length = Inf,
show.legend = FALSE
)

#ggsave( filename = "graphs/sree2024/contour_all_1000.pdf", width = IMAGE_WIDTH, height = 5 )
# Moving to large covariate set plot ----
d_b = make_trail_set( filter( ca1_agg,
train_set_size == 1000,
outcome_index == 1,
cov_set_size=="small"),
filter( ca1_agg,
train_set_size == 1000,
outcome_index == 1,
cov_set_size=="large" ) )
pltB <- ggplot( no_core( d_b ), aes(se, bias) ) +
contour_plot_base( max_bias = max_bias, max_se = max_se, step = .1,include_scales = FALSE ) +
geom_segment( data= d_b, aes(x = se_pre, y = bias_pre, xend = se, yend = bias),
#arrow = arrow(length = unit(0.2, "cm")),
lwd = 1, col="lightgrey") +
geom_point( data = no_core( d_b ), aes(color = type), size = 4, shape=19 ) +
geom_point( data = only_core( d_b ), aes(pch=model), col="darkgrey", fill="darkgrey", size=4 ) +
#labs( title = "Trails of Models from 1000 to 5000 training size",
# subtitle = "Aggregated over scenarios and queens" ) +
#scale_size_identity() +
scale_shape_manual(breaks = c( "ATE", "OLS S", "LASSO INF", "RF INF" ),
values = c( 15, 23, 25, 24 ) ) +
guides(color = guide_legend(title = "Model Type"),
shape = guide_legend(title = "Reference") ) +
coord_fixed( xlim=c(0, max_se) ) +
theme( legend.position = "none" )
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.

contour_trail_to_large.pdf
pltB + ggrepel::geom_text_repel(
aes(label = modelshort, col = type),
size = 3, # Increased label size
max.overlaps = Inf,
min.segment.length = Inf,
show.legend = FALSE
)

#ggsave( filename = "graphs/sree2024/contour_trail_to_large.pdf",
# width = IMAGE_WIDTH, height = 5 )
# Moving to large training set plot ----
d_b = make_trail_set( filter( ca1_agg,
outcome_index == 1,
train_set_size == 1000,
cov_set_size=="large"),
filter( ca1_agg, train_set_size == 5000, cov_set_size=="large" ) )
pltB <- ggplot( no_core( d_b ), aes(se, bias) ) +
contour_plot_base( max_bias = max_bias, max_se = max_se,step = .1, include_scales = FALSE ) +
geom_segment( data= d_b, aes(x = se_pre, y = bias_pre, xend = se, yend = bias),
#arrow = arrow(length = unit(0.2, "cm")),
lwd = 1, col="lightgrey") +
geom_point( data = no_core( d_b ), aes(color = type), size = 4, shape=19 ) +
geom_point( data = only_core( d_b ), aes(pch=model), col="darkgrey", fill="darkgrey", size=4 ) +
#labs( title = "Trails of Models from 1000 to 5000 training size",
# subtitle = "Aggregated over scenarios and queens" ) +
#scale_size_identity() +
scale_shape_manual(breaks = c( "ATE", "OLS S", "LASSO INF", "RF INF" ),
values = c( 15, 23, 25, 24 ) ) +
guides(color = guide_legend(title = "Model Type"),
shape = guide_legend(title = "Reference") ) +
coord_fixed( xlim=c(0, max_se) ) +
theme( legend.position = "none" )
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.

contour_trail_to_5000.pdf
pltB + ggrepel::geom_text_repel(
aes(label = modelshort, col = type),
size = 3, # Increased label size
max.overlaps = Inf,
min.segment.length = Inf,
show.legend = FALSE
)

#ggsave( filename = "graphs/sree2024/contour_trail_to_5000.pdf", width = IMAGE_WIDTH, height = 5 )
# Binary outcome ----
bin <- df %>%
dplyr::filter( outcome_index == 2 )
table( bin$model )
##
## ATE BART S BART T CDML CF
## 102 102 102 102 102
## CF LC LASSO INF LASSO MCM LASSO MCM EA LASSO MOM DR
## 102 102 102 102 102
## LASSO MOM IPW LASSO R LASSO T OLS S RF INF
## 102 102 102 102 102
## RF MOM DR RF MOM IPW RF T SL S SL T
## 102 102 102 11 11
filter( df, outcome_index == 2, model == "LASSO INF" ) %>%
dplyr::select( -bias, -rmse, -se, -outcome_index )
filter( df, outcome_index == 2, model == "BART T" ) %>%
dplyr::select( -bias, -rmse, -se, -outcome_index )
#
# if ( FALSE ) {
#
# asap <- df %>%
# dplyr::filter( dataset == "asap" )
# table( asap$model )
#
# group_by( model, modelshort, type, baseline ) %>%
# summarise( n = n(),
# bias = sqrt( mean(bias^2 ) ),
# se = sqrt( mean( se^2 ) ),
# rmse = sqrt( mean( rmse^2 ) ), .groups = "drop" )
#
# bin <- df_agg %>%
# dplyr::filter( outcome_index == 2, dataset == "ca" )
#
# dd = bin
# max_bias = 0.15 #max( dd$bias )
# max_se = 0.15 #max( dd$se )
# p1C <- ggplot( no_core( dd ), aes(se, bias) ) +
# contour_plot_base( max_bias = max_bias, max_se = max_se, include_scales = FALSE, step = 0.025 ) +
# geom_point( aes(color = type), size = 4, shape=19 ) +
# geom_point( data = only_core( dd ), aes(pch=model), col="darkgrey", fill="darkgrey", size=4 ) +
# # geom_point( data = gbs, size = 7, shape = 1, col="red" ) +
# scale_shape_manual(breaks = c( "ATE", "OLS S", "LASSO INF", "RF INF" ),
# values = c( 15, 23, 25, 24 ) ) +
# guides(color = guide_legend(title = "Model Type"),
# shape = guide_legend(title = "Reference") ) +
# coord_fixed( xlim=c(0, max_se) ) +
# theme( legend.position = "none" )
#
# p1C + ggrepel::geom_text_repel(
# aes(label = modelshort, col = type),
# size = 3, # Increased label size
# max.overlaps = Inf,
# min.segment.length = Inf,
# show.legend = FALSE
# )
# ggsave( filename = "graphs/sree2024/binary_ca.pdf", width = IMAGE_WIDTH, height = 5 )
#
# }
# By queen ----
df <- mutate( df, is_ATE = ifelse( queen == "ATE", "ATE", "Not ATE" ) )
table( df$model )
##
## ATE BART S BART T CDML CF
## 198 198 198 198 198
## CF LC LASSO INF LASSO MCM LASSO MCM EA LASSO MOM DR
## 198 198 198 198 198
## LASSO MOM IPW LASSO R LASSO T OLS S RF INF
## 198 198 198 198 198
## RF MOM DR RF MOM IPW RF T SL S SL T
## 198 198 198 77 77
all_queen_plot.pdf
ggplot( df, aes(se, bias, xmax=max_se, ymax=max_bias, col=is_ATE, pch= ) ) +
facet_wrap( ~ model, nrow=3 ) +
contour_plot_base( max_bias = max_bias, max_se = max_bias,step= .1, include_scales = FALSE) +
geom_point( size = 1, shape=19, alpha=0.8 ) +
labs( #title = "All queens for all models across all CA sets",
color = "ATE Queen" ) +
scale_size_identity() +
scale_color_manual(breaks = c("ATE", "Not ATE"),
values = c( "red", "black" ) ) +
theme(panel.grid.minor = element_blank(),
legend.position = "none" )

#ggsave( filename = "graphs/sree2024/all_queen_plot.pdf", width = 11, height = 6 )
# Performance across 6 scenarios
table( ca1_agg$model )
##
## ATE BART S BART T CDML CF
## 9 9 9 9 9
## CF LC LASSO INF LASSO MCM LASSO MCM EA LASSO MOM DR
## 9 9 9 9 9
## LASSO MOM IPW LASSO R LASSO T OLS S RF INF
## 9 9 9 9 9
## RF MOM DR RF MOM IPW RF T SL S SL T
## 9 9 9 6 6
# ggplot( ca1_agg, aes( cov_set_size, rmse, group=as.factor(train_set_size), col=as.factor(train_set_size) ) ) +
# facet_wrap( ~model ) +
# geom_line() + geom_point() +
# theme_minimal() +
# theme( panel.grid.minor = element_blank() )
# Regression plots ----
# These plots plot the results of regressing rmse, se, and bias onto
# the simulation conditions and queen and model.
#
# We fit three models, and then make plots of the different families
# of coefficients.
# Prep data for regression by making ATE the reference category for
# both queen and model.
df
df_sub <- df %>%
dplyr::filter( outcome_index == 1,
dataset == "ca" ) %>%
mutate( model = relevel( factor( model ), ref = "ATE" ),
queen = relevel( factor( queen ), ref = "ATE" ),
cov_set_size = factor( cov_set_size, levels = c("small","medium","large" ) ),
N5000 = 0 + (train_set_size==5000),
) %>%
dplyr::filter( !str_detect( model, "INF"))
## Match queen and model for "home field advantage" flag ----
# table( df_sub$model )
# table( df_sub$queen )
#
# setdiff( unique( df_sub$model ),
# unique( df_sub$queen ) )
# This tries to get which models are "at home" with which queens.
#
# TODO: Update this to generate a "linear model/linear queen" match.
home <- function( model, queen ) {
if ( model == queen ) {
return( TRUE )
}
if ( model == "BART S" ) {
return (queen == "BART T" )
}
if ( str_detect( model, "LASSO" ) ) {
return( str_detect( queen, "LASSO" ) )
}
if ( model == "SL S" ) {
return (queen == "SL T" )
}
if ( str_detect( model, "RF MOM" ) ) {
return( queen == "RF MOM IPW" )
}
if ( model == "CF LC" ) {
return( queen == "CF" )
}
return( FALSE )
}
# testing code
home( df_sub$model[[4]], df_sub$queen[[4]] )
## [1] FALSE
# Calculate home for each model
df_sub <- mutate( df_sub,
home = map2_dbl( model, queen, home ) )
## Fit the linear models & make result tables ----
M_rmse = lm( rmse ~ 1 + model + queen + cov_set_size + N5000 + home, data = df_sub )
M_se = lm( se ~ 1 + model + queen + cov_set_size + N5000 + home, data = df_sub )
M_bias = lm( bias ~ 1 + model + queen + cov_set_size + N5000 + home, data = df_sub )
library( broom )
c_rmse <- tidy( M_rmse ) %>%
dplyr::select( term, estimate, std.error )
c_se <- tidy( M_se ) %>%
dplyr::select( term, estimate, std.error )
c_bias <- tidy( M_bias ) %>%
dplyr::select( term, estimate, std.error )
cs <- bind_rows( rmse = c_rmse, se = c_se, bias = c_bias, .id = "measure" )
cs$term = str_replace( cs$term, "cov_set_size", "cov_" )
cs$term = str_replace( cs$term, "queen", "Q " )
cs$term = str_replace( cs$term, "model", "model " )
table( cs$term )
##
## (Intercept) cov_large cov_medium home
## 3 3 3 3
## model BART S model BART T model CDML model CF
## 3 3 3 3
## model CF LC model LASSO MCM model LASSO MCM EA model LASSO MOM DR
## 3 3 3 3
## model LASSO MOM IPW model LASSO R model LASSO T model OLS S
## 3 3 3 3
## model RF MOM DR model RF MOM IPW model RF T model SL S
## 3 3 3 3
## model SL T N5000 Q BART T Q CDML
## 3 3 3 3
## Q CF Q LASSO MCM EA Q LASSO R Q OLS S
## 3 3 3 3
## Q RF MOM IPW Q RF T Q SL T Q XGBOOST R
## 3 3 3 3
cs <- cs %>%
dplyr::filter( term != "(Intercept)" ) %>%
mutate( queen = ifelse( str_detect( term, "Q " ), "queen",
ifelse( str_detect( term, "model" ), "model", "coef" ) ),
term = str_replace(term, "model |queen ", "" ) )
csA = cs
c_coef <- filter( csA,
term %in% c( "N5000", "cov_medium", "cov_large", "home", "(Intercept)" ) ) %>%
arrange( term ) %>%
dplyr::select( -queen ) %>%
relocate( term )
cs <- cs %>%
dplyr::filter( !term %in% c( "N5000", "cov_medium", "cov_large", "home", "(Intercept)" ) )
table( cs$term )
##
## BART S BART T CDML CF CF LC
## 3 3 3 3 3
## LASSO MCM LASSO MCM EA LASSO MOM DR LASSO MOM IPW LASSO R
## 3 3 3 3 3
## LASSO T OLS S Q BART T Q CDML Q CF
## 3 3 3 3 3
## Q LASSO MCM EA Q LASSO R Q OLS S Q RF MOM IPW Q RF T
## 3 3 3 3 3
## Q SL T Q XGBOOST R RF MOM DR RF MOM IPW RF T
## 3 3 3 3 3
## SL S SL T
## 3 3
# Sort the coefficients by performance in terms of rmse
ord = cs %>%
dplyr::filter( measure =="rmse" ) %>%
mutate( f = reorder( factor( term ), estimate ) )
ord$f
## [1] BART S BART T CDML CF CF LC
## [6] LASSO MCM LASSO MCM EA LASSO MOM DR LASSO MOM IPW LASSO R
## [11] LASSO T OLS S RF MOM DR RF MOM IPW RF T
## [16] SL S SL T Q BART T Q CDML Q CF
## [21] Q LASSO MCM EA Q LASSO R Q OLS S Q RF MOM IPW Q RF T
## [26] Q SL T Q XGBOOST R
## attr(,"scores")
## BART S BART T CDML CF CF LC
## 0.01844234775 0.08595900192 0.04698835786 -0.04369528076 -0.04197173434
## LASSO MCM LASSO MCM EA LASSO MOM DR LASSO MOM IPW LASSO R
## 0.02249857790 -0.03157334268 -0.03128625262 0.02267070807 -0.03164611671
## LASSO T OLS S Q BART T Q CDML Q CF
## -0.01501162791 0.03026792687 0.09281595111 0.09402909347 0.09134145489
## Q LASSO MCM EA Q LASSO R Q OLS S Q RF MOM IPW Q RF T
## 0.06716853747 0.06278324937 0.07478162497 0.06541522181 0.08782420665
## Q SL T Q XGBOOST R RF MOM DR RF MOM IPW RF T
## 0.08495553485 0.06373403149 -0.03255642065 0.03610600799 0.00003682072
## SL S SL T
## 0.01180908830 0.03770912044
## 27 Levels: CF CF LC RF MOM DR LASSO R LASSO MCM EA LASSO MOM DR ... Q CDML
cs <- mutate( cs,
term = factor( term, levels = levels(ord$f) ) )
cs
levels( cs$term[ cs$queen == "queen" ] )
## [1] "CF" "CF LC" "RF MOM DR" "LASSO R"
## [5] "LASSO MCM EA" "LASSO MOM DR" "LASSO T" "RF T"
## [9] "SL S" "BART S" "LASSO MCM" "LASSO MOM IPW"
## [13] "OLS S" "RF MOM IPW" "SL T" "CDML"
## [17] "Q LASSO R" "Q XGBOOST R" "Q RF MOM IPW" "Q LASSO MCM EA"
## [21] "Q OLS S" "Q SL T" "BART T" "Q RF T"
## [25] "Q CF" "Q BART T" "Q CDML"
## Make the final coefficient plots ----
# Plot model and queen on single plot
library( scales )
##
## Attaching package: 'scales'
## The following objects are masked from 'package:formattable':
##
## comma, percent, scientific
## The following object is masked from 'package:arm':
##
## rescale
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor